ML with R - KNN Project

Suhas. P. K

2024-02-06

K Nearest Neighbour Project

Since KNN is such a simple algorithm, we will just use this “Project” as a simple exercise to test your understanding of the implementation of KNN.

Iris Dataset

We’ll use the famous iris data set for this project. It’s a small data set with flower features that can be used to attempt to predict the species of an iris flower.

if(!require(ggpubr)){
  install.packages("ggpubr")
  library(ggpubr)
}

if(!require(ISLR2)){
  install.packages("ISLR2")
  library(ISLR2)
}

if(!require(ggplot2)){
  install.packages("ggplot2")
  library(ggplot2)
}

if(!require(ggdark)){
  install.packages("ggdark")
  library(ggdark)
}

if(!require(caTools)){
  install.packages("caTools")
  library(caTools)
}

if(!require(class)){
  install.packages("class")
  library(class)
}

if(!require(plotly)){
  install.packages("plotly")
  library(plotly)
}

Loading iris dataset

# Work on a copy of the built-in iris data set and preview its first rows.
df <- iris
head(df)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
  • Dataset structure
str(df)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
  • Dataset summary
summary(df)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

Standardize data

  • The iris data set has all its features in the same order of magnitude, but it's good practice (especially with KNN, which relies on distances between points) to standardize the features in your data.

Use scale() to standardize the feature columns of the iris dataset. Set this standardized version of the data as a new variable.

# Standardize the four numeric measurement columns; the Species label in
# column 5 is left out.
standard.feature <- scale(df[, 1:4])
  • Checking that the scaling worked by checking the variance of one of the new columns.
var(standard.feature[,1])
## [1] 1
  • Join the standardized data with the response/target/label column (the column with the species names).
# Re-attach the label: pick the Species column by name and bind it next to
# the standardized features, then preview the combined data frame.
final.data <- cbind(standard.feature, df["Species"])
head(final.data)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1   -0.8976739  1.01560199    -1.335752   -1.311052  setosa
## 2   -1.1392005 -0.13153881    -1.335752   -1.311052  setosa
## 3   -1.3807271  0.32731751    -1.392399   -1.311052  setosa
## 4   -1.5014904  0.09788935    -1.279104   -1.311052  setosa
## 5   -1.0184372  1.24503015    -1.335752   -1.311052  setosa
## 6   -0.5353840  1.93331463    -1.165809   -1.048667  setosa

Train-test split

# Fix the RNG seed so the split is reproducible.
set.seed(101)

# sample.split() returns a logical vector with one entry per row:
# TRUE marks a training row (about 70% of the rows), FALSE a test row.
sample <- sample.split(final.data$Species, SplitRatio = 0.70)

# Filter with a logical condition. Writing `sample = TRUE` instead would pass
# a named argument that subset() swallows in `...`, so no rows would be
# filtered and train and test would both be the complete data set.
train <- subset(final.data, sample)
test <- subset(final.data, !sample)

Build a KNN model.

# Fit KNN model on the two sepal features only (columns 1:2), i.e. the same
# two dimensions that are drawn in the scatter plot below.
k <- 10
knn_model <- knn(train[, 1:2], test[, 1:2], train[, 5], k = k)

# Combine predictions with test data
test_with_pred <- cbind(test, Predicted_Species = knn_model)

# Plot the predicted classes with ggplot2 (dark theme from ggdark).
# Axis labels match the mapped aesthetics: x is Sepal.Width, y is Sepal.Length.
ggplot(test_with_pred, aes(x = Sepal.Width, y = Sepal.Length, color = Predicted_Species)) +
  geom_point(size = 3, alpha = 0.9) +
  # The ellipse outline is fixed to white, so only `fill` is mapped here.
  stat_ellipse(aes(fill = Predicted_Species), geom = "polygon", alpha = 0.4, color = "white") +
  labs(title = paste0("KNN Cluster Probability Ellipses (k = ", k, ")"),
       x = "Sepal.Width", y = "Sepal.Length", fill = "Predicted Species", color = "Predicted Species") +
  scale_color_manual(values = c("setosa" = "blue", "versicolor" = "green", "virginica" = "red")) +
  scale_fill_manual(values = c("setosa" = "lightblue", "versicolor" = "lightgreen", "virginica" = "lightcoral")) +
  dark_theme_light()
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().

# Fit KNN model on the two sepal features only (columns 1:2), i.e. the same
# two dimensions that are drawn in the scatter plot below.
k <- 5
knn_model <- knn(train[, 1:2], test[, 1:2], train[, 5], k = k)

# Combine predictions with test data
test_with_pred <- cbind(test, Predicted_Species = knn_model)

# Overlay the training rows (single "Train" group) and the test rows coloured
# by predicted species, using ggplot2 with the ggdark theme.
# Axis labels match the mapped aesthetics: x is Sepal.Width, y is Sepal.Length.
# Each species uses the same hue for its point colour and its ellipse fill.
knn.plt <- ggplot() +
  geom_point(data = train, aes(x = Sepal.Width, y = Sepal.Length, color = "Train"), size = 3, alpha = 0.7) +
  geom_point(data = test_with_pred, aes(x = Sepal.Width, y = Sepal.Length, color = Predicted_Species), size = 3, alpha = 0.7) +
  stat_ellipse(data = train, aes(x = Sepal.Width, y = Sepal.Length, fill = "Train"), geom = "polygon", alpha = 0.2, color = "black") +
  stat_ellipse(data = test_with_pred, aes(x = Sepal.Width, y = Sepal.Length, fill = Predicted_Species), geom = "polygon", alpha = 0.2, color = "black") +
  labs(title = paste0("KNN Cluster Probability Ellipses (k = ", k, ")"),
       x = "Sepal.Width", y = "Sepal.Length", fill = "Dataset", color = "Dataset") +
  scale_color_manual(values = c("Train" = "blue", "setosa" = "green", "versicolor" = "yellow", "virginica" = "red")) +
  scale_fill_manual(values = c("Train" = "lightblue", "setosa" = "lightgreen", "versicolor" = "lightyellow", "virginica" = "lightcoral")) +
  dark_theme_light()

# Convert the static ggplot into an interactive plotly widget and display it.
ggplty <- ggplotly(knn.plt)

ggplty